1 Setup

suppressPackageStartupMessages({
  library(DESeq2)
  library(dplyr)
  library(gplots)
  library(ggplot2)
  library(here)
  library(hyperSpec)
  library(limma)
  library(parallel)
  library(plotly)
  library(tibble)
})
source(here("UPSCb-common/src/R/featureSelection.R"))
hpal <- colorRampPalette(c("blue","white","red"))(100)
load(here("analysis/salmon/ZE-29Seed-dds.rda"))
levels(dds.29z$Experiment) <- c("ZE","Seed")

2 Normalise

vsd <- varianceStabilizingTransformation(dds.29z,blind=FALSE)

3 Batch effect

3.1 Estimation

mat <- assay(vsd)

We select FMG and ZE from mature seeds (29Seed), that correspond to Stage B8 and B9 of the ZE

sel <- dds.29z$Experiment == "Seed" | dds.29z$Time %in% c("B8","B9")

mat <- mat[,sel]

batch = dds.29z$Experiment[sel]

contrasts(batch) <- contr.sum(levels(batch))

design <- model.matrix(~batch)

fit <- lmFit(mat, design)

beta is the actual batch correction for each gene

beta <- fit$coefficients[, -1, drop = FALSE]
beta[is.na(beta)] <- 0

cross-validation with the removeBatchEffect function

stopifnot(all((mat - beta %*% t(design[,-1,drop=FALSE]))[1:6,1:6] == 
                removeBatchEffect(mat,dds.29z$Experiment[sel])[1:6,1:6]))

3.2 Correction

load(here("analysis/salmon/ZE-SE-dds.rda"))
levels(dds.sz$Experiment) <- c("ZE","SE")
vsd <-varianceStabilizingTransformation(dds.sz,blind=FALSE)

fullbatch <- vsd$Experiment
contrasts(fullbatch) <- contr.sum(levels(fullbatch))
fullbatch <- model.matrix(~fullbatch)[, -1, drop = FALSE]

assay(vsd) <- assay(vsd) - beta %*% t(fullbatch)

4 Quality Assessment

4.1 PCA

pc <- prcomp(t(assay(vsd)))
percent <- round(summary(pc)$importance[2,]*100)
  • Cumulative components effect

We define the number of variable of the model

nvar=2

An the number of possible combinations

nlevel=nlevels(vsd$Tissue) * nlevels(vsd$Time)

We plot the percentage explained by the different components, the red line represent the number of variable in the model, the orange line the number of variable combinations.

ggplot(tibble(x=1:length(percent),y=cumsum(percent)),aes(x=x,y=y)) +
  geom_line() + scale_y_continuous("variance explained (%)",limits=c(0,100)) +
  scale_x_continuous("Principal component") + 
  geom_vline(xintercept=nvar,colour="red",linetype="dashed",size=0.5) + 
  geom_hline(yintercept=cumsum(percent)[nvar],colour="red",linetype="dashed",size=0.5) +
  geom_vline(xintercept=nlevel,colour="orange",linetype="dashed",size=0.5) + 
  geom_hline(yintercept=cumsum(percent)[nlevel],colour="orange",linetype="dashed",size=0.5)

4.1.1 2D

pc.dat <- bind_cols(PC1=pc$x[,1],
                    PC2=pc$x[,2],
                    as.data.frame(colData(vsd)))

p <- ggplot(pc.dat,aes(x=PC1,y=PC2,col=Time,shape=Tissue,text=NGI.ID)) + 
  geom_point(size=2) + 
  ggtitle("Principal Component Analysis",subtitle="variance stabilized counts")

ggplotly(p) %>% 
  layout(xaxis=list(title=paste("PC1 (",percent[1],"%)",sep="")),
         yaxis=list(title=paste("PC2 (",percent[2],"%)",sep="")))

4.1.2 Heatmap

Filter for noise - the filtering is meant to restrict the number of genes to be visualised to a reasonable amount that can be viewed in a limited amount of time

conds <- factor(paste(vsd$Time,vsd$Tissue))
sels <- rangeFeatureSelect(counts=assay(vsd),
                           conditions=conds,
                           nrep=3)
## Warning in xy.coords(x, y, xlabel, ylabel, log): 1 y value <= 0 omitted from
## logarithmic plot

vst.cutoff <- 6
  • Heatmap (>20k genes)
hm <- heatmap.2(t(scale(t(assay(vsd)[sels[[vst.cutoff+1]],]))),
                distfun=pearson.dist,
                hclustfun=function(X){hclust(X,method="ward.D2")},
                labRow = NA,trace = "none",
                labCol = conds,
                col=hpal)

plot(as.hclust(hm$colDendrogram),xlab="",sub="",labels=conds,cex=0.8)

5 Export

dir.create(here("analysis/batchCorrection"),showWarnings=FALSE)
save(vsd,file=here("analysis/batchCorrection/vsd.rda"))

6 Data sub-selection

Based on the visualisation above, we focus on ZE Stage 3-9 and SE Stage 3-6

sample.sel <- (vsd$Time %in% paste0("B",3:6) & vsd$Tissue =="SE") | 
  (vsd$Tissue=="ZE" & vsd$Time %in% paste0("B",3:9))

vsd <- vsd[,sample.sel]
vsd$Time <- droplevels(vsd$Time)

6.1 PCA

pc <- prcomp(t(assay(vsd)))
percent <- round(summary(pc)$importance[2,]*100)
  • Cumulative components effect

We define the number of variable of the model

nvar=2

An the number of possible combinations

nlevel=nlevels(vsd$Tissue) * nlevels(vsd$Time)

We plot the percentage explained by the different components, the red line represent the number of variable in the model, the orange line the number of variable combinations.

ggplot(tibble(x=1:length(percent),y=cumsum(percent)),aes(x=x,y=y)) +
  geom_line() + scale_y_continuous("variance explained (%)",limits=c(0,100)) +
  scale_x_continuous("Principal component") + 
  geom_vline(xintercept=nvar,colour="red",linetype="dashed",size=0.5) + 
  geom_hline(yintercept=cumsum(percent)[nvar],colour="red",linetype="dashed",size=0.5) +
  geom_vline(xintercept=nlevel,colour="orange",linetype="dashed",size=0.5) + 
  geom_hline(yintercept=cumsum(percent)[nlevel],colour="orange",linetype="dashed",size=0.5)

6.1.1 2D

pc.dat <- bind_cols(PC1=pc$x[,1],
                    PC2=pc$x[,2],
                    as.data.frame(colData(vsd)))

p <- ggplot(pc.dat,aes(x=PC1,y=PC2,col=Time,shape=Tissue,text=NGI.ID)) + 
  geom_point(size=2) + 
  ggtitle("Principal Component Analysis",subtitle="variance stabilized counts")

ggplotly(p) %>% 
  layout(xaxis=list(title=paste("PC1 (",percent[1],"%)",sep="")),
         yaxis=list(title=paste("PC2 (",percent[2],"%)",sep="")))

6.2 Hierarchical clustering

conds <- factor(paste(vsd$Time,vsd$Tissue))
sels <- rangeFeatureSelect(counts=assay(vsd),
                           conditions=conds,
                           nrep=3)
## Warning in xy.coords(x, y, xlabel, ylabel, log): 1 y value <= 0 omitted from
## logarithmic plot

vst.cutoff <- 4

hc <- hclust(pearson.dist(scale(t(assay(vsd)[sels[[vst.cutoff+1]],]))),
       method="ward.D2")

plot(hc,xlab="",sub="",labels=conds,cex=0.8)

7 Conclusion

The batch correction worked very nicely and while the “Tissue” Se vs. ZE still explains the variance in the first principal component, the second component explains the developmental series and is in agreement with the expectations, especially with regards to the maturation (SE, B3,4,5, ZE B3-B6) and desiccation (SE, B6 - ZE B7-9) time points.

8 SessionInfo

## R version 4.0.3 (2020-10-10)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 18.04.5 LTS
## 
## Matrix products: default
## BLAS/LAPACK: /usr/lib/x86_64-linux-gnu/libopenblasp-r0.2.20.so
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
##  [1] grid      parallel  stats4    stats     graphics  grDevices utils    
##  [8] datasets  methods   base     
## 
## other attached packages:
##  [1] tibble_3.0.4                plotly_4.9.2.1             
##  [3] limma_3.46.0                hyperSpec_0.99-20201127    
##  [5] xml2_1.3.2                  lattice_0.20-41            
##  [7] here_1.0.0                  ggplot2_3.3.2              
##  [9] gplots_3.1.1                dplyr_1.0.2                
## [11] DESeq2_1.30.0               SummarizedExperiment_1.20.0
## [13] Biobase_2.50.0              MatrixGenerics_1.2.0       
## [15] matrixStats_0.57.0          GenomicRanges_1.42.0       
## [17] GenomeInfoDb_1.26.1         IRanges_2.24.0             
## [19] S4Vectors_0.28.0            BiocGenerics_0.36.0        
## 
## loaded via a namespace (and not attached):
##  [1] bitops_1.0-6           bit64_4.0.5            RColorBrewer_1.1-2    
##  [4] httr_1.4.2             rprojroot_2.0.2        tools_4.0.3           
##  [7] R6_2.5.0               KernSmooth_2.23-18     DBI_1.1.0             
## [10] lazyeval_0.2.2         colorspace_2.0-0       withr_2.3.0           
## [13] tidyselect_1.1.0       bit_4.0.4              compiler_4.0.3        
## [16] Cairo_1.5-12.2         DelayedArray_0.16.0    labeling_0.4.2        
## [19] caTools_1.18.0         scales_1.1.1           genefilter_1.72.0     
## [22] stringr_1.4.0          digest_0.6.27          rmarkdown_2.5         
## [25] XVector_0.30.0         jpeg_0.1-8.1           pkgconfig_2.0.3       
## [28] htmltools_0.5.0        highr_0.8              htmlwidgets_1.5.2     
## [31] rlang_0.4.9            RSQLite_2.2.1          generics_0.1.0        
## [34] farver_2.0.3           jsonlite_1.7.1         crosstalk_1.1.0.1     
## [37] BiocParallel_1.24.1    gtools_3.8.2           RCurl_1.98-1.2        
## [40] magrittr_2.0.1         GenomeInfoDbData_1.2.4 Matrix_1.2-18         
## [43] Rcpp_1.0.5             munsell_0.5.0          lifecycle_0.2.0       
## [46] stringi_1.5.3          yaml_2.2.1             zlibbioc_1.36.0       
## [49] blob_1.2.1             crayon_1.3.4           splines_4.0.3         
## [52] annotate_1.68.0        locfit_1.5-9.4         knitr_1.30            
## [55] pillar_1.4.7           geneplotter_1.68.0     XML_3.99-0.5          
## [58] glue_1.4.2             evaluate_0.14          latticeExtra_0.6-29   
## [61] data.table_1.13.4      vctrs_0.3.5            png_0.1-7             
## [64] testthat_3.0.0         gtable_0.3.0           purrr_0.3.4           
## [67] tidyr_1.1.2            xfun_0.19              xtable_1.8-4          
## [70] survival_3.2-7         viridisLite_0.3.0      AnnotationDbi_1.52.0  
## [73] memoise_1.1.0          ellipsis_0.3.1